{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 12: Testing and Debugging in Polars"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Debugging chained operations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lf = pl.scan_csv('../data/pokemon.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .with_columns(\n",
    "        pl.col('Attack').rank(method='dense').alias('Atk Rank'),\n",
    "        pl.col('Defense').rank(method='dense').alias('Def Rank'),\n",
    "        pl.col('Speed').rank(method='dense').alias('Spe Rank'),\n",
    "    )\n",
    "    .select(\n",
    "        'Name',\n",
    "        'Total',\n",
    "        'Attack',\n",
    "        'Defense',\n",
    "        'Speed',\n",
    "        pl.col('^*Rank$')\n",
    "    )\n",
    "    .sort('Total')\n",
    "    .head()\n",
    "    .collect()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .with_columns(\n",
    "        pl.col('Attack').rank(method='dense').alias('Atk Rank'),\n",
    "        pl.col('Defense').rank(method='dense').alias('Def Rank'),\n",
    "        pl.col('Speed').rank(method='dense').alias('Spe Rank'),\n",
    "    )\n",
    "    .select(\n",
    "        'Name',\n",
    "        'Total',\n",
    "        'Attack',\n",
    "        'Deffense',\n",
    "        'Speed',\n",
    "        pl.col('^*Rank$')\n",
    "    )\n",
    "    .sort('Total')\n",
    "    .head()\n",
    "    .collect()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .with_columns(\n",
    "        pl.col('Attack').rank(method='dense').alias('Atk Rank'),\n",
    "        pl.col('Defense').rank(method='dense').alias('Def Rank'),\n",
    "        pl.col('Speed').rank(method='dense').alias('Spe Rank'),\n",
    "    )\n",
    "    # .select(\n",
    "    #     'Name',\n",
    "    #     'Total',\n",
    "    #     'Attack',\n",
    "    #     'Deffense',\n",
    "    #     'Speed',\n",
    "    #     pl.col('^*Rank$')\n",
    "    # )\n",
    "    .sort('Total')\n",
    "    .head()\n",
    "    .collect()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .collect()\n",
    "    .with_columns(\n",
    "        pl.col('Attack').rank(method='dense').alias('Atk Rank'),\n",
    "        pl.col('Defense').rank(method='dense').alias('Def Rank'),\n",
    "        pl.col('Speed').rank(method='dense').alias('Spe Rank'),\n",
    "    )\n",
    "    .select(\n",
    "        'Name',\n",
    "        'Total',\n",
    "        'Attack',\n",
    "        'Deffense',\n",
    "        'Speed',\n",
    "        pl.col('^*Rank$')\n",
    "    )\n",
    "    .sort('Total')\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_ranks(lf: pl.LazyFrame) -> pl.LazyFrame:\n",
    "    return (\n",
    "        lf\n",
    "        .with_columns(\n",
    "            pl.col('Attack').rank(method='dense').alias('Atk Rank'),\n",
    "            pl.col('Defense').rank(method='dense').alias('Def Rank'),\n",
    "            pl.col('Speed').rank(method='dense').alias('Spe Rank'),\n",
    "        )\n",
    "    )\n",
    "\n",
    "def keep_cols(lf: pl.LazyFrame) -> pl.LazyFrame:\n",
    "    return (\n",
    "        lf\n",
    "        .select(\n",
    "            'Name',\n",
    "            'Total',\n",
    "            'Attack',\n",
    "            'Defense',\n",
    "            'Speed',\n",
    "            pl.col('^*Rank$')\n",
    "        )\n",
    "    )\n",
    "\n",
    "(\n",
    "    lf\n",
    "    .pipe(add_ranks)\n",
    "    .pipe(keep_cols)\n",
    "    .sort('Total')\n",
    "    .head()\n",
    "    .collect()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspecting and optimizing the query plan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def keep_grass_or_fire(lf):\n",
    "    accepted_types = ['Grass', 'Fire']\n",
    "    return (\n",
    "        lf\n",
    "        .filter(\n",
    "            (pl.col('Type 1').is_in(accepted_types))\n",
    "            | (pl.col('Type 2').is_in(accepted_types))\n",
    "        )\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .pipe(add_ranks)\n",
    "    .pipe(keep_grass_or_fire)\n",
    "    .pipe(keep_cols)\n",
    "    .show_graph()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .pipe(add_ranks)\n",
    "    .pipe(keep_grass_or_fire)\n",
    "    .pipe(keep_cols)\n",
    "    .show_graph(optimized=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\n",
    "    lf\n",
    "    .pipe(add_ranks)\n",
    "    .pipe(keep_grass_or_fire)\n",
    "    .pipe(keep_cols)\n",
    "    .explain()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\n",
    "    lf\n",
    "    .pipe(add_ranks)\n",
    "    .pipe(keep_grass_or_fire)\n",
    "    .pipe(keep_cols)\n",
    "    .explain(optimized=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    lf\n",
    "    .pipe(keep_grass_or_fire)\n",
    "    .pipe(keep_cols)    \n",
    "    .pipe(add_ranks)\n",
    "    .show_graph()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\n",
    "    lf\n",
    "    .pipe(keep_grass_or_fire)\n",
    "    .pipe(keep_cols)    \n",
    "    .pipe(add_ranks)\n",
    "    .explain(streaming=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testing data quality with cuallee"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cuallee import Check, CheckLevel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = lf.collect()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "check = Check(CheckLevel.WARNING, 'Completeness')\n",
    "(\n",
    "    check\n",
    "    .is_complete('Name')\n",
    "    .is_unique('Name')\n",
    "    .validate(df)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "check = Check(CheckLevel.WARNING, 'Accepted Values')\n",
    "accepted_types = (\n",
    "    lf.select('Type 1')\n",
    "    .unique()\n",
    "    .collect()\n",
    "    .to_series()\n",
    "    .to_list()\n",
    ")\n",
    "\n",
    "(\n",
    "    check\n",
    "    .is_contained_in('Type 1', set(accepted_types))\n",
    "    .validate(df)\n",
    "    .select('check', 'column', 'rule', 'status')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "check = Check(CheckLevel.WARNING, 'Validation on Stats')\n",
    "stats_cols = [\n",
    "    'HP',\n",
    "    'Attack',\n",
    "    'Defense',\n",
    "    'Sp. Atk',\n",
    "    'Sp. Def',\n",
    "    'Speed'\n",
    "]\n",
    "res_cols = [\n",
    "    'check', \n",
    "    'column', \n",
    "    'rule', \n",
    "    'rows', \n",
    "    'violations', \n",
    "    'pass_rate', \n",
    "    'status'\n",
    "]\n",
    "\n",
    "(\n",
    "    check\n",
    "    .is_complete(stats_cols)\n",
    "    .is_greater_than(stats_cols, 0)\n",
    "    .validate(df)\n",
    "    .select(res_cols)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "check = Check(CheckLevel.WARNING, 'Completeness')\n",
    "cols = ['Name', 'Type 1', 'Type 2']\n",
    "(\n",
    "    check\n",
    "    .are_complete(cols)\n",
    "    .are_unique(cols)\n",
    "    .validate(df)\n",
    "    .select(\n",
    "        'check', \n",
    "        'column', \n",
    "        'rule', \n",
    "        'rows', \n",
    "        'violations', \n",
    "        'pass_rate', \n",
    "        'status'\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There is more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "check = Check(CheckLevel.WARNING, 'Completeness')\n",
    "result = (\n",
    "    check\n",
    "    .is_complete('Type 2')\n",
    "    .validate(df)\n",
    "    .select('status')[0,0]=='PASS'\n",
    ")\n",
    "assert result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting started with Pytest"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Please refer to `demo.py` and `test_demo.py`"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
